setwd('C:\\Users\\v\\Documents\\My Documents full backup\\ODU\\Modern topics seminar\\Day 03') library(lme4) # load library ###################################### #GLM #http://www.theanalysisfactor.com/glm-r-overdispersion-count-regression/ #Counts of students diagnosed as infected, number of days from initial outbreak cases=read.csv("cases example.csv") attach(cases) head(cases) #use defaul link function (natural log) model1 <- glm(Students ~ Days, family=poisson, data=cases) summary(model1) #The negative coefficient for Days indicates that as days increase, the mean number of students #with the disease is smaller. This coefficient is highly significant (p < 2e-16). #If the residual deviance were greater than the degrees of freedom, #there would be over-dispersion. This means that there is extra variance not accounted for #by the model or by the error structure. This is a very important model assumption, #so would have to re-fit the model using quasi poisson errors. #allows overdispersion parameter to be estimated model2 <- glm(Students ~ Days, family=quasipoisson, data=cases) summary(model2) #dispersion parameter is less than 1, so no overdispersion #use model 1 summary(model1) #OR model1$coefficients #We can use the residual deviance to perform a goodness of fit test for the overall model. #The residual deviance is the difference between the deviance of the current model and #the maximum deviance of the ideal model where the predicted values are identical to the observed. #Therefore, if the residual difference is small enough, the goodness of fit test will #not be significant, indicating that the model fits the data. We conclude that the model #fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant. #If the test had been statistically significant, it would indicate that the data do not fit the model well. #In that situation, we may try to determine if there are omitted predictor variables, #if our linearity assumption holds and/or if there is an issue of over-dispersion. with(model1, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail=FALSE))) #check model with diagnostic plots plot(model1) Y <- predict(model1) #in log scale Y2 <- exp(Y) #exponentiate of the fitted values to back-transform log scale to original data. write.csv(Y2, "Y2 exp.csv") #or use type="response" Z <- predict(model1, type="response") plot(Days, Students, xlab = "DAYS", ylab = "STUDENTS", pch = 16) lines(Y2, lwd = 3, col = "blue") lines(Z, lwd = 3, col = "red") #Let's calculate the impact on the number of cases arising from a one day increase along the time axis. First we take the exponential of the coefficients. coeffs <- exp(coef(model1)) coeffs #We calculate the 95% confidence interval (upper and lower confidence limits) as follows: CI <- exp(confint.default(model1)) CI #We can calculate the change in number of students presenting with the disease for each additional day, as follows: 1 - 0.9826884 #The reduction (rate ratio) is approximately 0.02 cases for each additional day. detach(cases) #################################### ####################################### ####GLMER #####simple repeated measures random effects model #Frequently we are not as interested in comparing the #particular subjects in the study as much as we are interested #in modeling the variability in the population from which the #subjects were chosen. #the individual or block which is repeatedly measured is the random effect #nuisance variable that must be included library(npmlreg) library(car) data(irlsuicide) head(irlsuicide) attach(irlsuicide) #Region : 13 different health regions of Ireland. #ID : factor with levels 1 2 3 . 12 13 corresponding to Regions. #pop : a numeric vector giving the population size of each suppopulation group. #death :a numeric vector giving the total number of deaths of each suppopulation group. #sex : a factor for gender with levels 0 (female) and 1 (male). #age : a factor for age with levels 1 (0-29), 2 (30-39), 3 (40-59), 4(60+ years). #smr : a numeric vector with standardized mortality ratios (SMRs) #expected : a numeric vector with `expected' number of cases obtained from a reference population (Ahlbom, 1993). #Response is "death", a count #examine Death by Region (the random factor) plot(death ~ Region, data=irlsuicide, pch=16,col="lightblue") #examine the data hist(death) #skewed, all positivie, Poisson distribution works #quick tables of the data (xtabs(death~ Region, data = irlsuicide)) #How do sex and age affect deaths, have to accout for Region # random intercept - The intercept for Region can vary. Sex and age are fixed. m1_res <-glmer(death ~ (1|Region) + sex + age, family= poisson, data = irlsuicide) summary(m1_res) #The assumption of Poisson distribution is that the variance is equal to the constant mean. #So the residuals have the same constant homogeneous assumption. If the data fits to a #Poisson distribution, then dispersion should be 1. However, dispersion can differ from 1 #when there is a random effect or there are clusters of observations. plot(m1_res) #overdispersion is if its much bigger than 1 library(RVAideMemoire) overdisp.glmer(m1_res) #The residual plot shows the nonhomogeneous variace property of the Irish Suicide data. #The overdispersion calculaton also returns greater than 1. #Therefore, further analyses such as examining over-dispersion, using other distributions, #centering the variables, or transformation of variables can be considered #for better model fitting without overdispersion problem. #Note: ##One ***possible*** approach is to create an observation-level random effect #Create a new dataset with one level per observation X=1:104 newirlsuicide=cbind(X,irlsuicide) #include that new variable as a random effect m2_res <-glmer(death ~ (1|Region)+(1|X) + sex + age , family= poisson, data = newirlsuicide) summary(m2_res) #test overdispersion again overdisp.glmer(m2_res) #You'd have to investigate if this were really approapriate for your issue. #Other approaches are possible. detach(irlsuicide)